AirBNB Data Set Analysis
Import Libraries
import pandas as pd
import geopandas as gp
import matplotlib
import matplotlib.pyplot as plt
import requests
import os
import fiona
import numpy as np
import shapely
from datetime import datetime
from shapely.geometry import Point
Load the data stored in the .csv files in the current working directory into pandas DataFrames
# Read the three CSV files from the current working directory, one DataFrame each
listings, review, neighbourhoods = (
    pd.read_csv(csv_name)
    for csv_name in ("listings.csv", "reviews.csv", "neighbourhoods.csv")
)
Creating map given with neighbourhoods.geojson
# Read the geometry stored in neighbourhoods.geojson; later code reads the
# `geometry` and `neighbourhood` fields of each row
suburbs = gp.read_file("neighbourhoods.geojson")
# IPython/Jupyter magic that selects the inline matplotlib backend.
# NOTE(review): this line is not plain Python; it only runs under IPython/Jupyter.
%matplotlib inline
# Updates matplotlib fontsize
plt.rcParams.update({'font.size': 22})
# Updates matplotlib default figure size (width, height)
plt.rcParams['figure.figsize'] = (35, 50)
def add_centroid(row):
    """Return the centroid of ``row.geometry``.

    Meant to be applied row by row so that a location name can later be
    drawn in the middle of each shape on the map.
    """
    shape = row.geometry
    return shape.centroid
def newGeometry(data_list):
    """Add a ``geometry`` column of Shapely Points to ``data_list``.

    Each point is built as ``Point(longitude, latitude)`` from the values of
    the ``longitude`` and ``latitude`` columns of the same row.

    The two columns are walked positionally with ``zip``. Looking values up
    with ``column[x]`` for ``x in range(len(column))`` is label based on a
    pandas Series and fails (or picks the wrong row) when the index is not
    exactly 0..n-1, e.g. after filtering.

    Args:
        data_list: Table with ``longitude`` and ``latitude`` columns. It is
            modified in place.

    Returns:
        The same ``data_list`` object, now carrying the ``geometry`` column.
    """
    data_list["geometry"] = [
        Point(lon, lat)
        for lon, lat in zip(data_list["longitude"], data_list["latitude"])
    ]
    return data_list
# Store the centroid of every suburb row so its name can be drawn there later
suburbs["centroid"] = suburbs.apply(add_centroid, axis=1)
# Update listings with a "geometry" column (Shapely Points) for plotting purposes
listings = newGeometry(listings)
# Wrap the listings DataFrame in a GeoDataFrame and store it in geo_listings;
# geo_listings is what gets plotted as the available rental locations.
# NOTE(review): no CRS is set here; presumably the coordinates use the same
# reference system as neighbourhoods.geojson — confirm
geo_listings = gp.GeoDataFrame(listings)
# Plot the suburb shapes
suburbs.plot(cmap="cool")
# Print every suburb name at the suburb's centroid.
# The label is passed positionally: the ``s=`` keyword of ``annotate`` was
# renamed to ``text`` and is rejected by current matplotlib releases, while
# the positional form works on old and new versions alike.
for idx, row in suburbs.iterrows():
    plt.annotate(row.neighbourhood, xy=tuple(row.centroid.coords)[0],
                 horizontalalignment='center')
plt.figtext(.5, 0.8, "NSW Suburb Map", fontsize=40, ha='center')
Plots all available AirBnb Rental Locations
# One shared Axes is required so that both GeoDataFrames end up in the same graph
f, ax = plt.subplots(1, figsize=(40, 50))
ax.set_axis_off()
ax.axis('equal')
# Suburb shapes first, rental locations on top of them
suburbs.plot(cmap='cool', linewidth=0.5, ax=ax)
geo_listings.plot(ax=ax, markersize=10, edgecolor='black',
                  categorical=True, legend=True)
plt.figtext(0.5, 0.85, "Available AirBnb Rentals in NSW", ha='center', fontsize=50)
Graph to display Types of AirBnB Rentals
def createRoomList(main_data):
    """Count how many listings exist for each room type.

    Args:
        main_data: Table with a ``room_type`` column (for example the
            listings DataFrame).

    Returns:
        A list of dictionaries ``{"rtype": <room type>, "count": <int>}``,
        one per distinct room type, ordered by first appearance in the
        column.
    """
    # Imported locally so the function does not depend on a file-level import.
    from collections import Counter

    # Iterating the column is positional, so this also works when the index
    # is not 0..n-1 (label lookups such as column[x] would fail there).
    # Counter keeps keys in first-seen order, which preserves the output
    # order of the original hand-written counting loop.
    counts = Counter(main_data["room_type"])
    return [{"rtype": rtype, "count": count} for rtype, count in counts.items()]
def addRoomCount(rlist, rtype):
    """Add one to ``count`` on every entry of ``rlist`` whose ``rtype`` matches.

    The entries are updated in place and the same list is returned.
    """
    matching = (entry for entry in rlist if entry["rtype"] == rtype)
    for entry in matching:
        entry["count"] += 1
    return rlist
def checkExistingRoom(rtype, rlist):
    """Tell whether ``rlist`` already holds an entry for ``rtype``.

    Returns 1 when an entry with that ``rtype`` exists, otherwise 0
    (which is also the answer for an empty list).
    """
    already_listed = any(entry["rtype"] == rtype for entry in rlist)
    return 1 if already_listed else 0
# Listing count per room type, as a DataFrame
room_data = pd.DataFrame(createRoomList(listings))
# numpy positions 0..n-1, one bar slot per room type
nump_row = np.arange(len(room_data))
plt.bar(nump_row, height=room_data["count"])
# Label every bar with its room type
plt.xticks(nump_row, room_data["rtype"])
Plot Avg Room Price for a Room Type & Associated Suburb
# Store all available room types into rt_list, in the order createRoomList
# returned them.
# NOTE(review): later code picks room types by position (rt_list[0],
# rt_list[1], ...), so it depends on this order — confirm against the data.
rt_list = room_data["rtype"]
def sortSuburb(main_data, rt):
    """Group the listings of one room type by suburb and average their price.

    For every listing whose ``room_type`` equals ``rt`` the value
    ``round(price / minimum_nights)`` is accumulated per ``neighbourhood``;
    at the end each suburb's sum is divided by its listing count and rounded.

    Args:
        main_data: Table with ``neighbourhood``, ``room_type``, ``price``
            and ``minimum_nights`` columns.
        rt: Room type to keep.

    Returns:
        A list of dictionaries with the keys ``location``, ``price`` (the
        rounded average), ``count`` (number of listings) and ``rtype``, one
        per suburb, ordered by first appearance.
    """
    # suburb -> running entry; dicts keep insertion order, so the result
    # stays in first-seen order without scanning a list for every row.
    by_suburb = {}
    # zip walks the columns positionally; label lookups via column[x] break
    # when the index is not 0..n-1.
    rows = zip(
        main_data["neighbourhood"],
        main_data["room_type"],
        main_data["price"],
        main_data["minimum_nights"],
    )
    for suburb, room_type, price, nights in rows:
        if room_type != rt:
            continue
        # NOTE(review): price is divided by minimum_nights to get a "daily"
        # price; this presumes `price` covers the whole minimum stay —
        # confirm against the data dictionary.
        nightly = round(price / nights)
        entry = by_suburb.get(suburb)
        if entry is None:
            by_suburb[suburb] = {
                "location": suburb,
                "price": nightly,
                "count": 1,
                "rtype": room_type,
            }
        else:
            entry["count"] += 1
            entry["price"] += nightly

    # Turn the accumulated sums into rounded averages.
    for entry in by_suburb.values():
        entry["price"] = round(entry["price"] / entry["count"])
    return list(by_suburb.values())
def avgPrice(rlist):
    """Replace every entry's summed ``price`` by its rounded average.

    The average is ``price / count`` of the same entry. Entries are changed
    in place and the same list is returned.
    """
    for entry in rlist:
        summed, listings_seen = entry["price"], entry["count"]
        entry["price"] = round(summed / listings_seen)
    return rlist
def addSuburbCount(rlist, suburb, price, days):
    """Fold one more listing into the entries of ``rlist`` located in ``suburb``.

    The listing contributes ``round(price / days)`` to ``price`` and one to
    ``count`` of every matching entry. Entries are updated in place and the
    same list is returned.
    """
    per_night = round(price / days)
    for entry in filter(lambda item: item["location"] == suburb, rlist):
        entry["count"] += 1
        entry["price"] += per_night
    return rlist
def checkExistingSuburb(loc, rlist):
    """Tell whether ``rlist`` already holds an entry located in ``loc``.

    Returns 1 when an entry with that ``location`` exists, otherwise 0
    (which is also the answer for an empty list).
    """
    already_listed = any(entry["location"] == loc for entry in rlist)
    return 1 if already_listed else 0
def _price_table(rows):
    """Return ``rows`` as a DataFrame with human-readable column headers."""
    return pd.DataFrame(rows).rename(columns={"count": "Available Rooms", "location": "Location", "price": "Avg. Price per night", "rtype": "Room Type"})


def _plot_avg_price(rows, room_type, figsize):
    """Plot the average price per suburb on a new figure and return the data.

    The title is derived from ``room_type`` so the chart is always labelled
    with the room type that was actually plotted, instead of relying on the
    room types appearing in a fixed order.
    """
    data = pd.DataFrame(rows)
    # The figure has to exist before plotting: calling plt.figure() after
    # the plot leaves the chart at the default size and opens an additional
    # empty figure.
    plt.figure(figsize=figsize)
    plt.plot(data["location"], data["price"])
    plt.xticks(data["location"], rotation='vertical', fontsize=25)
    plt.ylabel("Price", fontsize=50, labelpad=50)
    plt.title(f"{str(room_type).title()} Avg. Price", fontsize=50, pad=30)
    return data


# First room type: table, then chart
tmp = sortSuburb(listings, rt_list[0])
print_data = _price_table(tmp)
print_data
plot_data = _plot_avg_price(tmp, rt_list[0], (10, 10))

# Second room type: table, then chart
tmp = sortSuburb(listings, rt_list[1])
print_data = _price_table(tmp)
print_data
plot_data = _plot_avg_price(tmp, rt_list[1], (10, 10))

# Third room type: table, then chart
tmp = sortSuburb(listings, rt_list[2])
print_data = _price_table(tmp)
print_data
plot_data = _plot_avg_price(tmp, rt_list[2], (20, 10))
Get All Rental Locations Below Avg. Price in Sydney Suburb for Entire Home / Apt.
def sortCheapRentals(main_data, priceInfo, suburb, rtype):
    """List the rentals of one suburb and room type priced below the suburb average.

    A listing is kept when all of the following hold:

    * ``neighbourhood`` equals ``suburb`` and ``room_type`` equals ``rtype``;
    * ``last_review`` is a ``YYYY-MM-DD`` string dated 2018-01-01 or later
      (listings without a review date are skipped);
    * ``availability_365`` is at least 300;
    * ``price / minimum_nights`` is below ``findAvgPrice(priceInfo, suburb)``.

    Args:
        main_data: Listings table with the columns ``id``, ``name``,
            ``host_id``, ``neighbourhood``, ``room_type``, ``price``,
            ``minimum_nights``, ``availability_365`` and ``last_review``.
        priceInfo: Average-price table handed to ``findAvgPrice``.
        suburb: Neighbourhood to search.
        rtype: Room type to search.

    Returns:
        A list of dictionaries with the keys ``id``, ``name``, ``host_id``,
        ``location``, ``rtype``, ``price`` (rounded daily price),
        ``location_avg_price``, ``availability`` and ``min_night``.
    """
    min_date = datetime(2018, 1, 1)
    columns = (
        "id", "name", "host_id", "neighbourhood", "room_type", "price",
        "minimum_nights", "availability_365", "last_review",
    )
    # The suburb average is the same for every row, so it is looked up at
    # most once — and only when a row actually needs it.
    avg_price = None
    output = []
    # zip walks the columns positionally; label lookups via column[x] break
    # when the index is not 0..n-1.
    for values in zip(*(main_data[column] for column in columns)):
        row = dict(zip(columns, values))
        # Cheap filters first, so dates are only parsed for candidate rows.
        if row["neighbourhood"] != suburb or row["room_type"] != rtype:
            continue
        last_review = row["last_review"]
        # A missing review date is not a string (e.g. NaN): skip the listing.
        if not isinstance(last_review, str):
            continue
        if datetime.strptime(last_review, "%Y-%m-%d") < min_date:
            continue
        if row["availability_365"] < 300:
            continue
        nights = row["minimum_nights"]
        # A zero minimum stay has no finite daily price and can therefore
        # never be below the average.
        if not nights:
            continue
        dprice = row["price"] / nights
        if avg_price is None:
            avg_price = findAvgPrice(priceInfo, suburb)
        if dprice < avg_price:
            output.append({
                "id": row["id"],
                "name": row["name"],
                "host_id": row["host_id"],
                "location": row["neighbourhood"],
                "rtype": row["room_type"],
                "price": round(dprice),
                "location_avg_price": avg_price,
                "availability": row["availability_365"],
                "min_night": nights,
            })
    return output
def findAvgPrice(priceInfo, suburb):
    """Look up the average price recorded for ``suburb``.

    Args:
        priceInfo: Table with parallel ``location`` and ``price`` columns.
        suburb: Location to look for.

    Returns:
        The ``price`` of the first row whose ``location`` equals ``suburb``,
        or 0 when the suburb is not present.
    """
    # zip pairs the columns by position, so the lookup does not depend on
    # the table having index labels 0..n-1.
    for location, price in zip(priceInfo["location"], priceInfo["price"]):
        if location == suburb:
            return price
    return 0
# Average price per suburb for room type rt_list[1]; this is the baseline the
# rentals are compared against
priceInfo = pd.DataFrame(sortSuburb(listings, rt_list[1]))
# Rentals in "Sydney" of that room type that are cheaper than the baseline
sydney = pd.DataFrame(sortCheapRentals(listings, priceInfo, "Sydney", rt_list[1]))
# Readable column headers for display
sydney = sydney.rename(columns={
    "id": "Id",
    "location": "Location",
    "price": "Price/Night",
    "rtype": "Room Type",
    "host_id": "Host_Id",
    "location_avg_price": "Location Avg. Price",
    "availability": "Availability",
    "min_night": "Minimum Night",
})
sydney
Gets List of Active Rentals (2018-2019)
def getRecentLocation(main_data, year):
    """Collect the listings whose last review is dated in ``year`` or later.

    Unlike ``sortCheapRentals`` this applies no suburb, room type, price or
    availability filter; only the review date is checked.

    Args:
        main_data: Table with ``id``, ``geometry`` and ``last_review``
            columns; ``last_review`` holds ``YYYY-MM-DD`` strings, and
            non-string values (missing dates) are skipped.
        year: Listings reviewed on or after 1 January of this year are kept.

    Returns:
        A list of dictionaries with the keys ``id``, ``geometry`` and
        ``date`` (the original ``last_review`` string).
    """
    min_date = datetime(year, 1, 1)
    output = []
    # zip walks the columns positionally; label lookups via column[x] break
    # when the index is not 0..n-1.
    rows = zip(main_data["id"], main_data["geometry"], main_data["last_review"])
    for listing_id, geometry, last_review in rows:
        # A missing review date is not a string (e.g. NaN): skip the listing.
        if not isinstance(last_review, str):
            continue
        if datetime.strptime(last_review, "%Y-%m-%d") >= min_date:
            output.append({
                "id": listing_id,
                "geometry": geometry,
                "date": last_review,
            })
    return output
# Listings reviewed since 2018, first as a DataFrame, then as a GeoDataFrame
recent_id = pd.DataFrame(getRecentLocation(listings, 2018))
recent_geo = gp.GeoDataFrame(recent_id)
# One shared Axes is required so that both GeoDataFrames end up in the same graph
f, ax = plt.subplots(1, figsize=(40, 50))
ax.set_axis_off()
ax.axis('equal')
# Suburb shapes first, recently reviewed rentals on top of them
suburbs.plot(cmap='cool', linewidth=0.5, ax=ax)
recent_geo.plot(ax=ax, markersize=15, cmap="winter", edgecolor='black',
                categorical=True, legend=True)
plt.figtext(0.5, 0.85, "All AirBnb Past Rental, 2018-2019", ha='center', fontsize=50)
Line graph showing the total number of reviews per suburb (used as a measure of rent count)
def checkPopularity(main_data):
    """Sum ``number_of_reviews`` per neighbourhood.

    Args:
        main_data: Table with ``neighbourhood`` and ``number_of_reviews``
            columns.

    Returns:
        A list of dictionaries ``{"location": <neighbourhood>,
        "reviews": <summed review count>}``, one per distinct neighbourhood,
        ordered by first appearance.
    """
    # neighbourhood -> running total. A dict gives one entry per
    # neighbourhood (including falsy names such as "") and keeps first-seen
    # order, without rescanning the result list for every row.
    totals = {}
    # zip walks the columns positionally; label lookups via column[x] break
    # when the index is not 0..n-1.
    rows = zip(main_data["neighbourhood"], main_data["number_of_reviews"])
    for suburb, reviews in rows:
        totals[suburb] = totals.get(suburb, 0) + reviews
    return [
        {"location": suburb, "reviews": total}
        for suburb, total in totals.items()
    ]
def checkSameLocation(loc, out):
    """Tell whether ``out`` already holds an entry for location ``loc``.

    Used to avoid creating duplicate neighbourhood entries.

    The previous early exit tested ``loc`` instead of the list, so a falsy
    location (for example an empty string) was always reported as missing
    even when an entry for it existed. An empty ``out`` needs no special
    case: the scan simply finds nothing.

    Args:
        loc: Location to look for.
        out: List of dictionaries that each carry a ``location`` key.

    Returns:
        1 when an entry with that location exists, otherwise 0.
    """
    return 1 if any(entry["location"] == loc for entry in out) else 0
# Total number of reviews per neighbourhood
popRanking = checkPopularity(listings)
popRanking = pd.DataFrame(popRanking)
# Plotting reviews on graph
plot_data = popRanking
# The figure has to be created before plotting: calling plt.figure() after
# the plot leaves the chart at the default size and opens an extra empty figure.
plt.figure(figsize=(20, 10))
plt.plot(plot_data["location"], plot_data["reviews"])
plt.xticks(plot_data["location"], rotation='vertical', fontsize=25)
plt.ylabel("No. of Reviews", fontsize=50, labelpad=50)
plt.title("Suburbs", fontsize=50, pad=30)